From befe25d2ca43017345fe13e042106be1017de51f Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Fri, 18 Mar 2005 15:14:45 +0000 Subject: [PATCH] bitkeeper revision 1.1236.1.85 (423af065m4e0j4eXiTFvV-BrIlfC-A) Linux 2.6 now always uses writable page tables (even SMP builds). Also use native definitions for atomic read-modify-write operations on ptes. Fixed instruction emulator in Xen. Signed-off-by: Keir Fraser --- .../include/asm-xen/asm-i386/page.h | 14 ++----- .../include/asm-xen/asm-i386/pgtable-2level.h | 34 ++------------- .../include/asm-xen/asm-i386/pgtable.h | 41 ++++--------------- tools/tests/test_x86_emulator.c | 15 +++++++ xen/arch/x86/mm.c | 30 ++++++++------ xen/arch/x86/x86_emulate.c | 27 ++++++------ xen/include/asm-x86/page.h | 21 ++++++++-- xen/include/asm-x86/x86_32/page.h | 5 ++- xen/include/asm-x86/x86_64/page.h | 9 ++-- 9 files changed, 89 insertions(+), 107 deletions(-) diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h index e3dfb002c7..345b8264b8 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h @@ -116,17 +116,11 @@ static inline unsigned long pgd_val(pgd_t x) } #define pgprot_val(x) ((x).pgprot) -static inline pte_t __pte(unsigned long x) -{ - if (x & 1) x = phys_to_machine(x); - return ((pte_t) { (x) }); -} +#define __pte(x) ({ unsigned long _x = (x); \ + (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); }) #define __pte_ma(x) ((pte_t) { (x) } ) -static inline pgd_t __pgd(unsigned long x) -{ - if ((x & 1)) x = phys_to_machine(x); - return ((pgd_t) { (x) }); -} +#define __pgd(x) ({ unsigned long _x = (x); \ + (((_x)&1) ? 
((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); }) #define __pgprot(x) ((pgprot_t) { (x) } ) #endif /* !__ASSEMBLY__ */ diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h index 2b1c7fdfcd..d35d1c00e3 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h @@ -13,41 +13,13 @@ * within a page table are directly modified. Thus, the following * hook is made available. */ -#define set_pte_batched(pteptr, pteval) \ - queue_l1_entry_update(pteptr, (pteval).pte_low) - -#ifdef CONFIG_SMP -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval).pte_low) -#if 0 -do { \ - (*(pteptr) = pteval); \ - HYPERVISOR_xen_version(0); \ -} while (0) -#endif -#define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval) -#else #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) -#endif #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) +#define set_pte_batched(pteptr, pteval) \ + queue_l1_entry_update(pteptr, (pteval).pte_low) -/* - * A note on implementation of this atomic 'get-and-clear' operation. - * This is actually very simple because Xen Linux can only run on a single - * processor. Therefore, we cannot race other processors setting the 'accessed' - * or 'dirty' bits on a page-table entry. - * Even if pages are shared between domains, that is not a problem because - * each domain will have separate page tables, with their own versions of - * accessed & dirty state. 
- */ -static inline pte_t ptep_get_and_clear(pte_t *xp) -{ - pte_t pte = *xp; - if (pte.pte_low) - set_pte(xp, __pte_ma(0)); - return pte; -} - +#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) /* * We detect special mappings in one of two ways: diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h index 29f28ad533..7e40d708d2 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h @@ -89,9 +89,6 @@ void paging_init(void); # define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) #endif -extern void *high_memory; -extern unsigned long vmalloc_earlyreserve; - /* * The 4MB page is guessing.. Detailed in the infamous "Chapter H" * of the Pentium details, but assuming intel did the straightforward @@ -214,7 +211,7 @@ extern unsigned long pg0[]; /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. can temporarily clear it. 
*/ #define pmd_present(x) (pmd_val(x)) -/* pmd_clear below */ +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) @@ -254,34 +251,20 @@ static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return p static inline int ptep_test_and_clear_dirty(pte_t *ptep) { - pte_t pte = *ptep; - int ret = pte_dirty(pte); - if (ret) - xen_l1_entry_update(ptep, pte_mkclean(pte).pte_low); - return ret; + if (!pte_dirty(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } static inline int ptep_test_and_clear_young(pte_t *ptep) { - pte_t pte = *ptep; - int ret = pte_young(pte); - if (ret) - xen_l1_entry_update(ptep, pte_mkold(pte).pte_low); - return ret; + if (!pte_young(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } -static inline void ptep_set_wrprotect(pte_t *ptep) -{ - pte_t pte = *ptep; - if (pte_write(pte)) - set_pte(ptep, pte_wrprotect(pte)); -} -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - xen_l1_entry_update(ptep, pte_mkdirty(pte).pte_low); -} +static inline void ptep_set_wrprotect(pte_t *ptep) { clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } +static inline void ptep_mkdirty(pte_t *ptep) { set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } /* * Macro to mark a page protection value as "uncacheable". 
On processors which do not support @@ -316,11 +299,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define page_pte(page) page_pte_prot(page, __pgprot(0)) -#define pmd_clear(xp) do { \ - set_pmd(xp, __pmd(0)); \ - xen_flush_page_update_queue(); \ -} while (0) - #define pmd_large(pmd) \ ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) @@ -416,7 +394,6 @@ extern void noexec_setup(const char *str); */ #define update_mmu_cache(vma,address,pte) do { } while (0) #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS - #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ do { \ if (__dirty) { \ diff --git a/tools/tests/test_x86_emulator.c b/tools/tests/test_x86_emulator.c index e42d598943..25a4a4bcbe 100644 --- a/tools/tests/test_x86_emulator.c +++ b/tools/tests/test_x86_emulator.c @@ -158,6 +158,21 @@ int main(int argc, char **argv) goto fail; printf("okay\n"); + printf("%-40s", "Testing btrl $0x1,(%edi)..."); + instr[0] = 0x0f; instr[1] = 0xba; instr[2] = 0x37; instr[3] = 0x01; + res = 0x2233445F; + regs.eflags = 0x200; + regs.eip = (unsigned long)&instr[0]; + regs.edi = (unsigned long)&res; + cr2 = regs.edi; + rc = x86_emulate_memop(&regs, cr2, &emulops, 4); + if ( (rc != 0) || + (res != 0x2233445D) || + ((regs.eflags&0x201) != 0x201) || + (regs.eip != (unsigned long)&instr[4]) ) + goto fail; + printf("okay\n"); + return 0; fail: diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 166dd8ff07..1b2cfd5922 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2560,18 +2560,15 @@ static struct x86_mem_emulator ptwr_mem_emulator = { /* Write page fault handler: check if guest is trying to modify a PTE. */ int ptwr_do_page_fault(unsigned long addr) { - unsigned long pte, pfn, l2e; - struct pfn_info *page; - l2_pgentry_t *pl2e; - int which, cpu = smp_processor_id(); - u32 l2_idx; - -#ifdef __x86_64__ - return 0; /* Writable pagetables need fixing for x86_64. 
*/ -#endif + unsigned long pte, pfn, l2e; + struct pfn_info *page; + l2_pgentry_t *pl2e; + int which, cpu = smp_processor_id(); + u32 l2_idx; + struct exec_domain *ed = current; /* Can't use linear_l2_table with external tables. */ - BUG_ON(shadow_mode_external(current->domain)); + BUG_ON(shadow_mode_external(ed->domain)); /* * Attempt to read the PTE that maps the VA being accessed. By checking for @@ -2595,6 +2592,15 @@ int ptwr_do_page_fault(unsigned long addr) return 0; } + /* x86/64: Writable pagetable code needs auditing. Use emulator for now. */ +#if defined(__x86_64__) + goto emulate; +#endif + + /* Writable pagetables are not yet SMP safe. Use emulator for now. */ + if ( (ed->eid != 0) || (ed->ed_next_list != NULL) ) + goto emulate; + /* Get the L2 index at which this L1 p.t. is always mapped. */ l2_idx = page->u.inuse.type_info & PGT_va_mask; if ( unlikely(l2_idx >= PGT_va_unknown) ) @@ -2640,7 +2646,7 @@ int ptwr_do_page_fault(unsigned long addr) * If last batch made no updates then we are probably stuck. Emulate this * update to ensure we make progress. */ - if ( (ptwr_info[cpu].ptinfo[which].prev_exec_domain == current) && + if ( (ptwr_info[cpu].ptinfo[which].prev_exec_domain == ed) && (ptwr_info[cpu].ptinfo[which].prev_nr_updates == 0) ) { /* Force non-emul next time, or we can get stuck emulating forever. */ @@ -2653,7 +2659,7 @@ int ptwr_do_page_fault(unsigned long addr) /* For safety, disconnect the L1 p.t. page from current space. */ if ( (which == PTWR_PT_ACTIVE) && - likely(!shadow_mode_enabled(current->domain)) ) + likely(!shadow_mode_enabled(ed->domain)) ) { *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); flush_tlb(); /* XXX Multi-CPU guests? */ diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c index 7b959590f9..833e0e48bd 100644 --- a/xen/arch/x86/x86_emulate.c +++ b/xen/arch/x86/x86_emulate.c @@ -18,12 +18,14 @@ typedef int16_t s16; typedef int32_t s32; typedef int64_t s64; #include +#define DPRINTF(_f, _a...) 
printf( _f , ## _a ) #else #include #include #include #include #include +#define DPRINTF DPRINTK #endif #include @@ -226,22 +228,25 @@ struct operand { #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) /* Before executing instruction: restore necessary bits in EFLAGS. */ -/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~msk; */ #define _PRE_EFLAGS(_sav, _msk, _tmp) \ +/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */\ "push %"_sav"; " \ "movl %"_msk",%"_LO32 _tmp"; " \ "andl %"_LO32 _tmp",("_STK"); " \ -"notl %"_LO32 _tmp"; " \ -"andl %"_LO32 _tmp",%"_sav"; " \ "pushf; " \ +"notl %"_LO32 _tmp"; " \ "andl %"_LO32 _tmp",("_STK"); " \ "pop %"_tmp"; " \ "orl %"_LO32 _tmp",("_STK"); " \ -"popf; " +"popf; " \ +/* _sav &= ~msk; */ \ +"movl %"_msk",%"_LO32 _tmp"; " \ +"notl %"_LO32 _tmp"; " \ +"andl %"_LO32 _tmp",%"_sav"; " /* After executing instruction: write-back necessary bits in EFLAGS. */ -/* _sav |= EFLAGS & _msk; */ #define _POST_EFLAGS(_sav, _msk, _tmp) \ +/* _sav |= EFLAGS & _msk; */ \ "pushf; " \ "pop %"_tmp"; " \ "andl %"_msk",%"_LO32 _tmp"; " \ @@ -370,8 +375,6 @@ do{ __asm__ __volatile__ ( \ (_type)_x; \ }) -#define DPRINTF(_f, _a...) 
printf( _f , ## _a ) - void * decode_register( u8 modrm_reg, struct xen_regs *regs, int highbyte_regs) @@ -932,23 +935,23 @@ x86_emulate_memop( } break; case 0xa3: bt: /* bt */ - src.val &= (1UL << (1 << dst.bytes)) - 1; /* only subword offset */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags); break; case 0xb3: btr: /* btr */ - src.val &= (1UL << (1 << dst.bytes)) - 1; /* only subword offset */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags); break; case 0xab: bts: /* bts */ - src.val &= (1UL << (1 << dst.bytes)) - 1; /* only subword offset */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags); break; case 0xbb: btc: /* btc */ - src.val &= (1UL << (1 << dst.bytes)) - 1; /* only subword offset */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags); break; case 0xba: /* Grp8 */ - switch ( modrm_reg >> 1 ) + switch ( modrm_reg & 3 ) { case 0: goto bt; case 1: goto bts; diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index 6ec5dc98e3..78e2648b54 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -56,10 +56,23 @@ typedef struct { unsigned long pt_lo; } pagetable_t; #include #include -#define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START) -#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) - -#define va_to_l1mfn(_va) (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT) +#define linear_l1_table \ + ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define linear_l2_table \ + ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)))) +#define linear_l3_table \ + ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_VIRT_START >> 
(PAGETABLE_ORDER<<0)) + \ + (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1)))) +#define linear_l4_table \ + ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)) + \ + (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1)) + \ + (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<2)))) +#define linear_pg_table linear_l1_table +#define va_to_l1mfn(_va) \ + (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT) extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; diff --git a/xen/include/asm-x86/x86_32/page.h b/xen/include/asm-x86/x86_32/page.h index e28a7b65db..a86b6645a7 100644 --- a/xen/include/asm-x86/x86_32/page.h +++ b/xen/include/asm-x86/x86_32/page.h @@ -7,8 +7,9 @@ #define PAGE_SHIFT L1_PAGETABLE_SHIFT #define ROOT_PAGETABLE_SHIFT L2_PAGETABLE_SHIFT -#define L1_PAGETABLE_ENTRIES 1024 -#define L2_PAGETABLE_ENTRIES 1024 +#define PAGETABLE_ORDER 10 +#define L1_PAGETABLE_ENTRIES (1<